library(tidyverse)
library(knitr)
library(janitor)
library(DT)
Lab_5_Cooper.qmd
Kable
# Subset iris to setosa rows with a sepal length above 5; both conditions are
# passed to a single filter() call.
iris_setosa <- iris |>
  filter(Species == "setosa", Sepal.Length > 5)

# Render the subset as a static table.
library(knitr)
kable(iris_setosa)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 5.4 | 3.7 | 1.5 | 0.2 | setosa |
| 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 5.7 | 4.4 | 1.5 | 0.4 | setosa |
| 5.4 | 3.9 | 1.3 | 0.4 | setosa |
| 5.1 | 3.5 | 1.4 | 0.3 | setosa |
| 5.7 | 3.8 | 1.7 | 0.3 | setosa |
| 5.1 | 3.8 | 1.5 | 0.3 | setosa |
| 5.4 | 3.4 | 1.7 | 0.2 | setosa |
| 5.1 | 3.7 | 1.5 | 0.4 | setosa |
| 5.1 | 3.3 | 1.7 | 0.5 | setosa |
| 5.2 | 3.5 | 1.5 | 0.2 | setosa |
| 5.2 | 3.4 | 1.4 | 0.2 | setosa |
| 5.4 | 3.4 | 1.5 | 0.4 | setosa |
| 5.2 | 4.1 | 1.5 | 0.1 | setosa |
| 5.5 | 4.2 | 1.4 | 0.2 | setosa |
| 5.5 | 3.5 | 1.3 | 0.2 | setosa |
| 5.1 | 3.4 | 1.5 | 0.2 | setosa |
| 5.1 | 3.8 | 1.9 | 0.4 | setosa |
| 5.1 | 3.8 | 1.6 | 0.2 | setosa |
| 5.3 | 3.7 | 1.5 | 0.2 | setosa |
DT
# Show the previously built iris_setosa subset as a DT table.
library(DT)
iris_setosa |> datatable()
Bound and Present by Datatable
# Filter and display in one pipeline, without storing the subset.
iris |>
  filter(Species == "setosa", Sepal.Length > 5) |>
  datatable()
Create New Object
# Store the filtered rows in a named object first, then display that object.
iris_setosa <- filter(iris, Species == "setosa", Sepal.Length > 5)
iris_setosa |> datatable()
Neon Data Table
# Read the tab-separated NEON bin export into a tibble.
NEON_MAGs <- read_tsv(
  file = "/work/pi_bio678_umass_edu/data_NEON/exported_img_bins_Gs0166454_NEON.tsv"
)

# Preview the first rows.
NEON_MAGs |> head()
# A tibble: 6 × 21
bin_oid `Bin ID` `Genome Name` `IMG Genome ID` `Bin Quality` `Bin Lineage`
<chr> <chr> <chr> <dbl> <chr> <chr>
1 3300075492… 3300075… Soil microbi… 3300075492 MQ Bacteria
2 3300075492… 3300075… Soil microbi… 3300075492 MQ Bacteria
3 3300075492… 3300075… Soil microbi… 3300075492 MQ <NA>
4 3300075492… 3300075… Soil microbi… 3300075492 MQ Bacteria; Ac…
5 3300075492… 3300075… Soil microbi… 3300075492 MQ Bacteria; Ac…
6 3300075492… 3300075… Soil microbi… 3300075492 MQ Bacteria; Ac…
# ℹ 15 more variables: `GTDB Taxonomy Lineage` <chr>, `Bin Methods` <chr>,
# `Created By` <chr>, `Date Added` <date>, `Bin Completeness` <dbl>,
# `Bin Contamination` <dbl>, `Average Coverage` <lgl>,
# `Total Number of Bases` <dbl>, `5s rRNA` <dbl>, `16s rRNA` <dbl>,
# `23s rRNA` <dbl>, `tRNA Genes` <dbl>, `Gene Count` <dbl>,
# `Scaffold Count` <dbl>, `GOLD Study ID` <chr>
# Inspect the column names and parsed column types of the imported table.
NEON_MAGs |> str()
spc_tbl_ [16,669 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ bin_oid : chr [1:16669] "3300075492_s0" "3300075492_s1" "3300075492_s100" "3300075492_s106" ...
$ Bin ID : chr [1:16669] "3300075492_s0" "3300075492_s1" "3300075492_s100" "3300075492_s106" ...
$ Genome Name : chr [1:16669] "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ ...
$ IMG Genome ID : num [1:16669] 3.3e+09 3.3e+09 3.3e+09 3.3e+09 3.3e+09 ...
$ Bin Quality : chr [1:16669] "MQ" "MQ" "MQ" "MQ" ...
$ Bin Lineage : chr [1:16669] "Bacteria" "Bacteria" NA "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales" ...
$ GTDB Taxonomy Lineage: chr [1:16669] "Bacteria; Acidobacteriota; Terriglobia; Acidoferrales; UBA7541; Acidoferrum" "Bacteria; Desulfobacterota_B; Binatia; Binatales; Binataceae; Binatus; Binatus soli" "Archaea; Thermoplasmatota; Thermoplasmata; UBA184; UBA184; UBA184" "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales; Solirubrobacteraceae; Palsa-744" ...
$ Bin Methods : chr [1:16669] "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" ...
$ Created By : chr [1:16669] "IMG_PIPELINE" "IMG_PIPELINE" "IMG_PIPELINE" "IMG_PIPELINE" ...
$ Date Added : Date[1:16669], format: "2025-01-23" "2025-01-23" ...
$ Bin Completeness : num [1:16669] 95.9 99.6 57.9 74 67.3 ...
$ Bin Contamination : num [1:16669] 6.31 0 2.39 8.19 3.95 6.54 5.24 0.63 0.61 4.56 ...
$ Average Coverage : logi [1:16669] NA NA NA NA NA NA ...
$ Total Number of Bases: num [1:16669] 5600425 3706224 1233791 2084993 3809196 ...
$ 5s rRNA : num [1:16669] 1 1 1 1 0 0 0 1 1 0 ...
$ 16s rRNA : num [1:16669] 2 0 0 1 0 0 0 0 0 0 ...
$ 23s rRNA : num [1:16669] 2 1 1 1 0 0 0 1 0 0 ...
$ tRNA Genes : num [1:16669] 57 51 30 33 31 33 29 25 30 30 ...
$ Gene Count : num [1:16669] 4976 3617 1344 2284 3848 ...
$ Scaffold Count : num [1:16669] 26 45 210 322 398 85 527 435 243 409 ...
$ GOLD Study ID : chr [1:16669] "Gs0166454" "Gs0166454" "Gs0166454" "Gs0166454" ...
- attr(*, "spec")=
.. cols(
.. bin_oid = col_character(),
.. `Bin ID` = col_character(),
.. `Genome Name` = col_character(),
.. `IMG Genome ID` = col_double(),
.. `Bin Quality` = col_character(),
.. `Bin Lineage` = col_character(),
.. `GTDB Taxonomy Lineage` = col_character(),
.. `Bin Methods` = col_character(),
.. `Created By` = col_character(),
.. `Date Added` = col_date(format = ""),
.. `Bin Completeness` = col_double(),
.. `Bin Contamination` = col_double(),
.. `Average Coverage` = col_logical(),
.. `Total Number of Bases` = col_double(),
.. `5s rRNA` = col_double(),
.. `16s rRNA` = col_double(),
.. `23s rRNA` = col_double(),
.. `tRNA Genes` = col_double(),
.. `Gene Count` = col_double(),
.. `Scaffold Count` = col_double(),
.. `GOLD Study ID` = col_character()
.. )
- attr(*, "problems")=<externalptr>
# Normalise the column names (e.g. `Bin Quality` becomes bin_quality).
NEON_MAGs <- janitor::clean_names(NEON_MAGs)

# Tally the bins per bin_quality value, largest group first.
count(NEON_MAGs, bin_quality, sort = TRUE)
# A tibble: 2 × 2
bin_quality n
<chr> <int>
1 MQ 15369
2 HQ 1300
# Same tally without sorting by count, rendered as a static table.
NEON_MAGs |>
  count(bin_quality) |>
  kable()
| bin_quality | n |
|---|---|
| HQ | 1300 |
| MQ | 15369 |
# DT table of the bins whose bin_quality is "HQ".
NEON_MAGs |>
  filter(bin_quality == "HQ") |>
  datatable()

# Static table of lineage and size for bins with more than 10,000,000 bases.
NEON_MAGs |>
  select(gtdb_taxonomy_lineage, total_number_of_bases) |>
  filter(total_number_of_bases > 10000000) |>
  kable()
| gtdb_taxonomy_lineage | total_number_of_bases |
|---|---|
| Bacteria; Pseudomonadota; Gammaproteobacteria; Steroidobacterales; Steroidobacteraceae; 13-2-20CM-66-19 | 10115899 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 11932805 |
| Bacteria; Actinomycetota; Actinomycetes; Streptomycetales; Catenulisporaceae; Catenulispora | 12420050 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae; JAJPJC01 | 13151516 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 12046820 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae; JAQGHR01 | 12217509 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 12942605 |
| Bacteria; Actinomycetota; Actinomycetes; Mycobacteriales; Pseudonocardiaceae; Actinophytocola | 11293845 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 14411455 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 10890912 |
| Bacteria; Planctomycetota; Planctomycetia; Pirellulales; JAICIG01; JAICLL01 | 11379784 |
| Bacteria; Acidobacteriota; Terriglobia; Bryobacterales; Bryobacteraceae; PALSA-243 | 10475466 |
| NA | 10248256 |
| Bacteria; Pseudomonadota; Alphaproteobacteria; Rhizobiales; Xanthobacteraceae | 10436042 |
| Bacteria; Planctomycetota; Planctomycetia; Gemmatales; Gemmataceae | 13087515 |
| Bacteria; Actinomycetota; Actinomycetes; Mycobacteriales; Pseudonocardiaceae; Actinophytocola | 10431101 |
| Bacteria; Myxococcota; Polyangia; Polyangiales; JAFGIB01 | 10030262 |
| Bacteria; Myxococcota; Polyangia; Polyangiales; Polyangiaceae; JANYGI01 | 12560711 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 11026377 |
| Bacteria; Planctomycetota; Planctomycetia; Gemmatales; Gemmataceae | 12479517 |
| Bacteria; Acidobacteriota; Terriglobia; Bryobacterales; Bryobacteraceae; Solibacter | 10617884 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae; JAQGHR01 | 10431455 |
| Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae | 12864765 |
| Bacteria; Actinomycetota; Actinomycetes; Mycobacteriales; Mycobacteriaceae; Mycobacterium | 10583130 |
# Bins whose GTDB lineage string mentions Bacteroidota.
datatable(
  NEON_MAGs |>
    filter(str_detect(gtdb_taxonomy_lineage, "Bacteroidota"))
)

# Bins whose genome name mentions Yellowstone NP.
datatable(
  NEON_MAGs |>
    filter(str_detect(genome_name, "Yellowstone NP"))
)

# Split the GTDB lineage into one column per rank. Lineages in this export can
# reach species level (e.g. "...; Binatus; Binatus soli"), so a seventh column
# is included instead of discarding that piece. Shorter lineages are padded
# with NA on the right (`fill = "right"`) without a warning.
NEON_MAGs_tax <- NEON_MAGs |>
  separate(
    gtdb_taxonomy_lineage,
    c("domain", "phylum", "class", "order", "family", "genus", "species"),
    sep = "; ",
    remove = FALSE,
    fill = "right"
  )

# Number of bins per phylum, most frequent first.
datatable(
  NEON_MAGs_tax |>
    count(phylum, sort = TRUE)
)

NEON_MAGs_tax_sample <- NEON_MAGs_tax |>
  # Get rid of the common leading string. Genome names in this export start
  # with "Soil microbial communities from "; the older
  # "Terrestrial soil microbial communities from " prefix is accepted as well.
  mutate(
    genome_name = str_replace(
      genome_name,
      "^(?:Terrestrial soil|Soil) microbial communities from ",
      ""
    )
  ) |>
  # Split the column in two on " - ": site description and sample name
  separate(genome_name, c("site", "sample_name"), sep = " - ") |>
  # Get rid of the common string "-comp-1"
  mutate(sample_name = str_replace(sample_name, "-comp-1", "")) |>
  # Separate the sample name into site ID and plot info
  separate(
    sample_name,
    c("site_ID", "subplot.layer.date"),
    sep = "_",
    remove = FALSE
  ) |>
  # Separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("subplot", "layer", "date"), sep = "-")

# Number of bins per site, most frequent first.
datatable(
  NEON_MAGs_tax_sample |>
    count(site, sort = TRUE)
)
Exercises
Exercise 1
view(iris)
# Keep only the virginica rows; subset() keeps the original row names, which
# datatable() displays.
iris_subset <- subset(iris, Species == "virginica")
DT::datatable(iris_subset)
Exercise 2
# Bins with at least one 16S rRNA gene. filter() drops rows where the count is
# NA, whereas logical `[` indexing returns an all-NA row for each such value.
NEON_MAGs |>
  filter(x16s_r_rna >= 1) |>
  DT::datatable()
Exercise 3
# Genome name, lineage and size for the Lower Tombigbee bins.
# gold_study_id holds study accessions such as "Gs0166454", so comparing it
# to a site name never matches; the site description is part of genome_name.
# NOTE(review): assumes the site is spelled "Lower Tombigbee" in genome_name —
# confirm against the data.
NEON_MAGs |>
  filter(str_detect(genome_name, "Lower Tombigbee")) |>
  select(genome_name, gtdb_taxonomy_lineage, total_number_of_bases) |>
  DT::datatable()
Exercise 4
# Count the rows of study "Gs0166454" per bin_lineage value and show the
# tally in a DT table.
# NOTE(review): the count column is named class_count, but bin_lineage holds
# the whole lineage string rather than a single rank — confirm this is the
# intended grouping.
NEON_MAGs[NEON_MAGs$gold_study_id == "Gs0166454", ] |>
  count(bin_lineage, name = "class_count") |>
  DT::datatable()
Exercise 5
# Count, per gold_study_id, the bins whose GTDB lineage mentions
# Actinomycetota, and show the tally in a DT table.
NEON_MAGs |>
  filter(grepl("Actinomycetota", gtdb_taxonomy_lineage)) |>
  count(gold_study_id, name = "Actinomycetota_count") |>
  DT::datatable()